# source("baflights19.R")
library(tidyverse)
library(data.table)
library(purrr)
library(arsenal)
library(labelled)
library(gtsummary)
library(tibble)
library(sjlabelled)
library(expss)
library(knitr)
library(pacman)
p_load(tidyverse, nycflights13, tictoc)
The baflights19.R saves each tibble as an .Rds file within the data subdirectory. To load the tibbles, the read_rds function from the readr R package is employed.
airlines_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airlines.Rds")
airports_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airports.Rds")
flights_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/flights.Rds")
planes_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/planes.Rds")
weather_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/weather.Rds")
flights
flights_new
fs::dir_ls("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data")
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airlines.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airports.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/flights.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/planes.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/weather.Rds
Part I.
dep_flights <- flights_new %>% filter(!is.na(arr_delay)) %>% select(month, origin) %>% filter(month==1) %>% mutate(n= map_int(month,length)) %>% drop_na()
dep_flights
dim(dep_flights)
## [1] 22039 3
A total of 22,039 flights departed in January.
dep_flights %>% filter(origin=="SFO") %>% dim()
## [1] 13078 3
In January, San Francisco Airport recorded 13,078 flights.
dep_flights %>% filter(origin=="SJC") %>% dim()
## [1] 4841 3
In January, San Jose Airport registered 4,841 flights.
dep_flights %>% filter(origin=="OAK") %>% dim()
## [1] 4120 3
In January, Oakland Airport logged 4,120 flights.
comparedf(flights,flights_new)
## Compare Object
##
## Function Call:
## comparedf(x = flights, y = flights_new)
##
## Shared: 19 non-by variables and 286933 observations.
## Not shared: 0 variables and 49843 observations.
##
## Differences found in 19/19 variables compared.
## 1 variables compared have non-identical attributes.
#summary(comparedf(flights,flights_new))
``
data(flights)
flights = apply_labels(flights,
year = "Year of departure.",
month = "Month of departure.",
day = "Day of departure.)",
dep_time ="Actual departure (format HHMM or HMM), local tz.",
sched_dep_time = "Scheduled departure (format HHMM or HMM), local tz.",
dep_delay = "Departure delays, in minutes. Negative times represent early departures.",
arr_time = "Actual arrival times (format HHMM or HMM), local tz.",
sched_arr_time = "Scheduled arrival times (format HHMM or HMM), local tz.",
arr_delay = "Arrival delays, in minutes. Negative times represent early departures/arrivals.",
carrier = "Two letter carrier abbreviation. See airlines to get name.",
flight = "Flight number.",
tailnum = "Plane tail number. See planes for additional metadata.",
origin = "Origin. See airports for additional metadata.",
dest = "destination. See airports for additional metadata.",
air_time = "Amount of time spent in the air, in minutes.",
distance = "Distance between airports, in miles.",
hour = "Time of scheduled departure broken into hour.",
minute = "Time of scheduled departure broken into minutes.",
time_hour = "Scheduled date and hour of the flight as a POSIXct date. Along with origin, can be used to join flights data to weather data."
)
codebook <- enframe(get_label(flights))
kable(codebook)
| name | value |
|---|---|
| year | Year of departure. |
| month | Month of departure. |
| day | Day of departure.) |
| dep_time | Actual departure (format HHMM or HMM), local tz. |
| sched_dep_time | Scheduled departure (format HHMM or HMM), local tz. |
| dep_delay | Departure delays, in minutes. Negative times represent early departures. |
| arr_time | Actual arrival times (format HHMM or HMM), local tz. |
| sched_arr_time | Scheduled arrival times (format HHMM or HMM), local tz. |
| arr_delay | Arrival delays, in minutes. Negative times represent early departures/arrivals. |
| carrier | Two letter carrier abbreviation. See airlines to get name. |
| flight | Flight number. |
| tailnum | Plane tail number. See planes for additional metadata. |
| origin | Origin. See airports for additional metadata. |
| dest | destination. See airports for additional metadata. |
| air_time | Amount of time spent in the air, in minutes. |
| distance | Distance between airports, in miles. |
| hour | Time of scheduled departure broken into hour. |
| minute | Time of scheduled departure broken into minutes. |
| time_hour | Scheduled date and hour of the flight as a POSIXct date. Along with origin, can be used to join flights data to weather data. |
3. Identify the month with the highest proportion of cancelled flights and the month with the lowest proportion.
flights2 <- flights_new %>%
group_by(month) %>%
summarize(cancelled = sum(is.na(arr_delay)),total = n(), prop_cancelled = cancelled/total)
flights2
ggplot(data=flights2,aes(x=month,y=prop_cancelled)) + geom_point() + labs(title = " Proportion of Cancelled Flights Each Month", y = " proportion cancelled")
February exhibited the highest proportion of cancelled flights, whereas October had the lowest. Notably, there appears to be a notable spike in cancellation proportions during the winter and spring months (January, February, March, April) as well as during the summer and autumn months (June, July, October). These patterns suggest that specific circumstances during these seasons likely influence flight cancellations, potentially attributable to severe weather conditions.
4. Identify the aircraft (specified by the tailnum variable) that made the most trips from Bay Area airports in 2019. Additionally, create a plot illustrating the number of trips per week throughout the year.
flights_new %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
summarise(count = n()) %>% arrange(desc(count))%>%
head()
The aircraft N521VA had the highest number of trips from Bay Area airports in 2019.
flights2 <- flights_new %>%
filter(tailnum == "N521VA") %>%
mutate(week=week(time_hour)) %>%
group_by(week) %>%
summarise(count=n())
ggplot(data=flights2, aes(x=week,y=count)) +geom_point() + geom_line() + labs(title="Number of Trip from Bay Area for Plane N521VA Each Week")
5. Determine the oldest aircraft (identified by thetailnum variable) that departed from Bay Area airports in 2019. Additionally, ascertain the total count of aircraft that flew from Bay Area airports and are included in the planes table.
planes2 <- select(planes_new,tailnum,year, manufacturer)
flights2 <- select(flights_new,tailnum)
Bay_flights <- left_join(planes2,flights2) %>%
arrange(year) %>%
unique() %>% arrange(year)
## Joining, by = "tailnum"
head(Bay_flights)
The aircraft with the tail number N990JB, manufactured in 1977, is the oldest among those that flew from Bay Area airports in 2019.
Bay_flights2 <- distinct(Bay_flights)
nrow(Bay_flights2)
## [1] 4031
There are a total of 4031 distinct airplanes included in the dataset.
Determine the count of airplanes for which the date of manufacture is missing.
Bay_flights2 <- Bay_flights %>%
filter(is.na(year)) %>%
distinct(tailnum)
nrow(Bay_flights2)
## [1] 79
A total of 79 airplanes lack information regarding their date of manufacture.
6. Identify the top five manufacturers with the highest frequency of aircraft. Additionally, analyze whether there have been changes in the distribution of manufacturers over time, as evidenced by the airplanes flying from the Bay Area in 2019.
manufacturers <- Bay_flights %>%
select(manufacturer,tailnum,year) %>%
unique() %>%
group_by() %>%
group_by(manufacturer) %>%
summarise(count=n()) %>%
arrange(desc(count))
manufacturers
The top five manufacturers by frequency are Boeing, Airbus, Bombardier Inc, Embraer S A, C Series Aircradt LTD PTNRSP, Cessna
manufacturers <- Bay_flights %>%
mutate(manufacturer =
case_when(
manufacturer=="AIRBUS INDUSTRIE" ~"AIRBUS",
manufacturer=="AIRBUS CANADA LTD PTNRSP"~"AIRBUS",
manufacturer=="AIRBUS SAS" ~ "AIRBUS",
manufacturer=="EMBRAER S A"~"EMBRAER",TRUE~manufacturer)) %>%
select(manufacturer,tailnum,year) %>%
unique() %>%
group_by(manufacturer)%>%
summarise(count=n())%>%
arrange(desc(count))
manufacturers
Bay_flights2 <- left_join(manufacturers,Bay_flights) %>%
mutate(manu = ifelse(count >100, manufacturer,"Other"))
## Joining, by = "manufacturer"
Bay_flights2 %>%
group_by(manu)%>%
summarize(avgyear = mean( year, na.rm = TRUE)) %>%
arrange(desc(avgyear))
In historical usage, the predominant aircraft manufacturers have been Boeing and Bombardier, with Airbus and Embraer specializing in newer aircraft models.
PART II:
1. The inquiry concerns the temperature distribution in July 2013. It aims to identify significant outliers in terms of wind speed. Additionally, it seeks to analyze the relationship between dew and humidity, as well as the relationship between precipitation and visibility.
head(weather_new)
weather_new %>%
filter(month == 7) %>%
ggplot(aes(x = temp)) +
geom_histogram() +
labs(x = "Temperature", y = "Counts", title = "Distribution of temperature in July, 2019")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2230 rows containing non-finite values (stat_bin).
# Calculate the quartile
qua_windspeed <- quantile(weather_new$wind_speed, na.rm = TRUE)
qua_windspeed
## 0% 25% 50% 75% 100%
## 0.00000 4.60312 8.05546 12.65858 41.42808
#Calculate the interquartile range
IQR <- as.numeric(qua_windspeed[4]) - as.numeric(qua_windspeed[2])
IQR
## [1] 8.05546
weather_new %>%
filter((wind_speed >= as.numeric(qua_windspeed[4]) + 3*IQR) | (wind_speed <= as.numeric(qua_windspeed[2]) - 3*IQR)) %>%
arrange(desc(wind_speed))
boxplot(weather_new$wind_speed)
2. The investigation involves determining the number of days with precipitation in the Bay Area during 2019. Additionally, it examines potential disparities in mean visibility (visib) concerning both the day of the week and the month of the year.
weather_new %>%
select(day,precip,origin)%>%
filter(precip == 0 & origin == c("SFO","OAK","SJC")) %>%
summarise( n=n(),sum(n))
## Warning in origin == c("SFO", "OAK", "SJC"): longer object length is not a
## multiple of shorter object length
The count of rainless days totals 48. There were 317 days with precipitation in the Bay Area in 2019.
Avg_vib_day <- weather_new %>%
mutate(Dayofwk = wday(time_hour)) %>%
select(Dayofwk, visib ) %>%
filter(!is.na(visib)) %>%
group_by(Dayofwk) %>%
summarise(Avg_day = mean(visib))
Avg_vib_day
Avg_vib_mth <- weather_new %>%
select(month, visib ) %>%
filter(!is.na(visib)) %>%
group_by(month) %>%
summarise(Avg_mth = mean(visib))
Avg_vib_mth
Mean visibility varied notably based on the day of the week and month of the year, particularly during November and December.